Análise descritiva¶

Projeto PI-4

In [1]:
import pandas as pd
import geopandas as gpd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
2024-11-17 23:40:49.697671: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-17 23:40:49.698271: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-17 23:40:49.700420: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-17 23:40:49.706678: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-17 23:40:49.717067: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-17 23:40:49.720124: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-17 23:40:49.727968: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-17 23:40:50.257268: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
In [2]:
df=pd.read_csv("br-capes-btd-2021-2023-10-31.csv", encoding='ISO-8859-1', delimiter=';')
In [3]:
df_columns= pd.DataFrame(df.columns, columns=['Estados'])
In [4]:
df['DT_TITULACAO'].head(1)
Out[4]:
0    30JUN2021:00:00:00
Name: DT_TITULACAO, dtype: object
In [5]:
df_columns
Out[5]:
Estados
0 AN_BASE
1 CD_PROGRAMA
2 NM_PROGRAMA
3 CD_ENTIDADE_CAPES
4 SG_ENTIDADE_ENSINO
5 NM_ENTIDADE_ENSINO
6 ID_ADD_PRODUCAO_INTELECTUAL
7 ID_PRODUCAO_INTELECTUAL
8 NM_PRODUCAO
9 ID_SUBTIPO_PRODUCAO
10 NM_SUBTIPO_PRODUCAO
11 ID_AREA_CONCENTRACAO
12 NM_AREA_CONCENTRACAO
13 ID_LINHA_PESQUISA
14 NM_LINHA_PESQUISA
15 ID_PROJETO
16 NM_PROJETO
17 DH_INICIO_AREA_CONC
18 DH_FIM_AREA_CONC
19 DH_INICIO_LINHA
20 DH_FIM_LINHA
21 DT_TITULACAO
22 DS_PALAVRA_CHAVE
23 DS_ABSTRACT
24 DS_KEYWORD
25 IN_TRABALHO_MESMA_AREA
26 NM_TP_VINCULO
27 IN_ORIENT_PARTICIPOU_BANCA
28 DS_BIBLIOTECA_DEPOSITARIA
29 ID_TP_EXPECTATIVA_ATUACAO
30 NM_EXPECTATIVA_ATUACAO
31 ID_PESSOA_DISCENTE
32 NM_DISCENTE
33 DT_MATRICULA
34 ID_GRAU_ACADEMICO
35 NM_GRAU_ACADEMICO
36 NM_ORIENTADOR
37 DS_CATEGORIA_ORIENTADOR
38 NM_CATEGORIA_DOCENTE
39 NM_REGIAO
40 SG_UF_IES
41 NM_UF_IES
42 CD_GRANDE_AREA_CONHECIMENTO
43 NM_GRANDE_AREA_CONHECIMENTO
44 CD_AREA_CONHECIMENTO
45 NM_AREA_CONHECIMENTO
46 CD_SUBAREA_CONHECIMENTO
47 NM_SUBAREA_CONHECIMENTO
48 CD_ESPECIALIDADE
49 NM_ESPECIALIDADE
50 NM_AREA_AVALIACAO
51 NR_VOLUME
52 NR_PAGINAS
53 NM_IDIOMA
54 DS_RESUMO
55 DS_URL_TEXTO_COMPLETO
56 ID_PESSOA_ORIENTADOR
57 IN_TCC_COM_VINCULO_PRODUCAO
58 ID_ADD_PRODUCAO_VINCULO_CT
  • Medidas tendencia central
  • Boxplot
  • Histograma
  • Curtose, assimetria
  • Variação intraclasse -> determinar correlação
  • Variação interclasse -> determinar correlação
  • Coeficiente de crammer -> variaveis categoricas

Grafico

  • Grafico de linhas

    • Numero de trabalhos publicados ao longo do tempo, cresce ou aumenta?
  • Grafico de barras

    • comparar as classes de trabalhos - áreas
  • Grafico de bolhas - geográfico

  • Mapas de calor para verificar quais as dias do ano mais pessoas defendem ao longo dos anos

  • Tree map alternativa ao grafico de pizza

  • Grafico de barras um do lado da outra para comparar defesa de mulheres e homens (2021,2022,2023)

  • dados cíclicos - grafico radar

  • Mapa cloroplético -

correlações

  • INVESTIMENTO FEDERAL em educação vs numero de pos graduandos

Trabalhos por estado¶

In [6]:
df_grouped=pd.DataFrame(df.groupby('NM_UF_IES').size())
df_grouped = df_grouped.reset_index(names='Count')
df_grouped.columns = ['Estados', 'Count']
In [7]:
fig = px.bar(df_grouped.sort_values(by='Count', ascending=False), x='Estados', y='Count', title='Distribuição de trabalhos por estado')
fig.update_layout(yaxis_title='Count', xaxis_title='Estado')
fig.show()
In [8]:
shapefile_path = "BR_UF_2022/BR_UF_2022.shp"
gdf = gpd.read_file(shapefile_path)
gdf['centroid'] = gdf.geometry.centroid

gdf.head(30)
/tmp/ipykernel_2871610/4091591602.py:3: UserWarning:

Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.


Out[8]:
CD_UF NM_UF SIGLA_UF NM_REGIAO AREA_KM2 geometry centroid
0 12 Acre AC Norte 164173.429 POLYGON ((-68.79282 -10.99957, -68.79367 -10.9... POINT (-70.47293 -9.21327)
1 13 Amazonas AM Norte 1559255.881 POLYGON ((-56.76292 -3.23221, -56.76789 -3.242... POINT (-64.65345 -4.15411)
2 15 Pará PA Norte 1245870.704 MULTIPOLYGON (((-48.97548 -0.19834, -48.97487 ... POINT (-53.07149 -3.98042)
3 16 Amapá AP Norte 142470.762 MULTIPOLYGON (((-51.04561 -0.05088, -51.05422 ... POINT (-51.96202 1.44746)
4 17 Tocantins TO Norte 277423.627 POLYGON ((-48.2483 -13.19239, -48.24844 -13.19... POINT (-48.3313 -10.14808)
5 21 Maranhão MA Nordeste\n 329651.496 MULTIPOLYGON (((-44.5868 -2.23341, -44.58696 -... POINT (-45.28777 -5.07221)
6 22 Piauí PI Nordeste\n 251755.481 POLYGON ((-42.47034 -3.48377, -42.46126 -3.484... POINT (-42.97045 -7.3893)
7 23 Ceará CE Nordeste\n 148894.447 POLYGON ((-37.87162 -4.3664, -37.87109 -4.3670... POINT (-39.61579 -5.09322)
8 24 Rio Grande do Norte RN Nordeste\n 52809.599 MULTIPOLYGON (((-35.18728 -5.78987, -35.18707 ... POINT (-36.67327 -5.8398)
9 25 Paraíba PB Nordeste\n 56467.242 MULTIPOLYGON (((-34.7958 -7.175, -34.79578 -7.... POINT (-36.83246 -7.12104)
10 26 Pernambuco PE Nordeste\n 98067.877 MULTIPOLYGON (((-35.04823 -8.60936, -35.04756 ... POINT (-37.99768 -8.32522)
11 27 Alagoas AL Nordeste\n 27830.661 MULTIPOLYGON (((-35.287 -9.14489, -35.28699 -9... POINT (-36.62485 -9.51367)
12 28 Sergipe SE Nordeste\n 21938.188 MULTIPOLYGON (((-37.01203 -10.92784, -37.01267... POINT (-37.44379 -10.58376)
13 29 Bahia BA Nordeste\n 564760.429 MULTIPOLYGON (((-39.26447 -8.61413, -39.26341 ... POINT (-41.72116 -12.47533)
14 31 Minas Gerais MG Sudeste\n 586513.983 POLYGON ((-42.51148 -14.98627, -42.50964 -14.9... POINT (-44.67336 -18.45618)
15 32 Espírito Santo ES Sudeste\n 46074.448 MULTIPOLYGON (((-40.27883 -20.33437, -40.27883... POINT (-40.66851 -19.57518)
16 33 Rio de Janeiro RJ Sudeste\n 43750.425 MULTIPOLYGON (((-42.00612 -22.88563, -42.00634... POINT (-42.66278 -22.19572)
17 35 São Paulo SP Sudeste\n 248219.485 MULTIPOLYGON (((-46.47312 -22.70498, -46.47289... POINT (-48.72896 -22.26584)
18 41 Paraná PR Sul\n 199298.981 MULTIPOLYGON (((-48.30974 -25.49328, -48.27691... POINT (-51.61664 -24.63588)
19 42 Santa Catarina SC Sul\n 95730.690 MULTIPOLYGON (((-49.23653 -26.03711, -49.2365 ... POINT (-50.47471 -27.2474)
20 43 Rio Grande do Sul RS Sul\n 281707.151 MULTIPOLYGON (((-51.71873 -31.85463, -51.71941... POINT (-53.24515 -29.78646)
21 50 Mato Grosso do Sul MS Centro-oeste\n 357142.082 POLYGON ((-54.68379 -23.8305, -54.68569 -23.83... POINT (-54.84556 -20.32733)
22 51 Mato Grosso MT Centro-oeste\n 903208.361 POLYGON ((-56.0716 -17.17062, -56.07246 -17.17... POINT (-55.91228 -12.94898)
23 52 Goiás GO Centro-oeste\n 340242.859 POLYGON ((-47.33502 -15.58733, -47.33512 -15.5... POINT (-49.62251 -16.04119)
24 53 Distrito Federal DF Centro-oeste\n 5760.784 POLYGON ((-48.01472 -16.04996, -48.01573 -16.0... POINT (-47.79685 -15.78117)
25 11 Rondônia RO Norte 237754.172 POLYGON ((-62.60021 -13.01675, -62.59999 -13.0... POINT (-62.84196 -10.91314)
26 14 Roraima RR Norte 223644.530 POLYGON ((-60.12972 4.50843, -60.1296 4.50826,... POINT (-61.39191 2.08271)
In [9]:
df_sg=pd.DataFrame(df.groupby('SG_UF_IES').size())
df_sg = df_sg.reset_index(names='Count')
df_sg.columns = ['SIGLA_UF', 'Count']
df_sg.head()
Out[9]:
SIGLA_UF Count
0 AC 215
1 AL 676
2 AM 860
3 AP 139
4 BA 3045
In [10]:
gdf_joined = gdf.merge(df_sg, how="left", on="SIGLA_UF")
gdf.head(30)
Out[10]:
CD_UF NM_UF SIGLA_UF NM_REGIAO AREA_KM2 geometry centroid
0 12 Acre AC Norte 164173.429 POLYGON ((-68.79282 -10.99957, -68.79367 -10.9... POINT (-70.47293 -9.21327)
1 13 Amazonas AM Norte 1559255.881 POLYGON ((-56.76292 -3.23221, -56.76789 -3.242... POINT (-64.65345 -4.15411)
2 15 Pará PA Norte 1245870.704 MULTIPOLYGON (((-48.97548 -0.19834, -48.97487 ... POINT (-53.07149 -3.98042)
3 16 Amapá AP Norte 142470.762 MULTIPOLYGON (((-51.04561 -0.05088, -51.05422 ... POINT (-51.96202 1.44746)
4 17 Tocantins TO Norte 277423.627 POLYGON ((-48.2483 -13.19239, -48.24844 -13.19... POINT (-48.3313 -10.14808)
5 21 Maranhão MA Nordeste\n 329651.496 MULTIPOLYGON (((-44.5868 -2.23341, -44.58696 -... POINT (-45.28777 -5.07221)
6 22 Piauí PI Nordeste\n 251755.481 POLYGON ((-42.47034 -3.48377, -42.46126 -3.484... POINT (-42.97045 -7.3893)
7 23 Ceará CE Nordeste\n 148894.447 POLYGON ((-37.87162 -4.3664, -37.87109 -4.3670... POINT (-39.61579 -5.09322)
8 24 Rio Grande do Norte RN Nordeste\n 52809.599 MULTIPOLYGON (((-35.18728 -5.78987, -35.18707 ... POINT (-36.67327 -5.8398)
9 25 Paraíba PB Nordeste\n 56467.242 MULTIPOLYGON (((-34.7958 -7.175, -34.79578 -7.... POINT (-36.83246 -7.12104)
10 26 Pernambuco PE Nordeste\n 98067.877 MULTIPOLYGON (((-35.04823 -8.60936, -35.04756 ... POINT (-37.99768 -8.32522)
11 27 Alagoas AL Nordeste\n 27830.661 MULTIPOLYGON (((-35.287 -9.14489, -35.28699 -9... POINT (-36.62485 -9.51367)
12 28 Sergipe SE Nordeste\n 21938.188 MULTIPOLYGON (((-37.01203 -10.92784, -37.01267... POINT (-37.44379 -10.58376)
13 29 Bahia BA Nordeste\n 564760.429 MULTIPOLYGON (((-39.26447 -8.61413, -39.26341 ... POINT (-41.72116 -12.47533)
14 31 Minas Gerais MG Sudeste\n 586513.983 POLYGON ((-42.51148 -14.98627, -42.50964 -14.9... POINT (-44.67336 -18.45618)
15 32 Espírito Santo ES Sudeste\n 46074.448 MULTIPOLYGON (((-40.27883 -20.33437, -40.27883... POINT (-40.66851 -19.57518)
16 33 Rio de Janeiro RJ Sudeste\n 43750.425 MULTIPOLYGON (((-42.00612 -22.88563, -42.00634... POINT (-42.66278 -22.19572)
17 35 São Paulo SP Sudeste\n 248219.485 MULTIPOLYGON (((-46.47312 -22.70498, -46.47289... POINT (-48.72896 -22.26584)
18 41 Paraná PR Sul\n 199298.981 MULTIPOLYGON (((-48.30974 -25.49328, -48.27691... POINT (-51.61664 -24.63588)
19 42 Santa Catarina SC Sul\n 95730.690 MULTIPOLYGON (((-49.23653 -26.03711, -49.2365 ... POINT (-50.47471 -27.2474)
20 43 Rio Grande do Sul RS Sul\n 281707.151 MULTIPOLYGON (((-51.71873 -31.85463, -51.71941... POINT (-53.24515 -29.78646)
21 50 Mato Grosso do Sul MS Centro-oeste\n 357142.082 POLYGON ((-54.68379 -23.8305, -54.68569 -23.83... POINT (-54.84556 -20.32733)
22 51 Mato Grosso MT Centro-oeste\n 903208.361 POLYGON ((-56.0716 -17.17062, -56.07246 -17.17... POINT (-55.91228 -12.94898)
23 52 Goiás GO Centro-oeste\n 340242.859 POLYGON ((-47.33502 -15.58733, -47.33512 -15.5... POINT (-49.62251 -16.04119)
24 53 Distrito Federal DF Centro-oeste\n 5760.784 POLYGON ((-48.01472 -16.04996, -48.01573 -16.0... POINT (-47.79685 -15.78117)
25 11 Rondônia RO Norte 237754.172 POLYGON ((-62.60021 -13.01675, -62.59999 -13.0... POINT (-62.84196 -10.91314)
26 14 Roraima RR Norte 223644.530 POLYGON ((-60.12972 4.50843, -60.1296 4.50826,... POINT (-61.39191 2.08271)
In [11]:
fig, ax = plt.subplots(1, 1, figsize=(15, 12))
gdf_joined.plot(column="Count", cmap='OrRd', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)

for idx, row in gdf.iterrows():
    ax.text(row["centroid"].x, row["centroid"].y, row["SIGLA_UF"], fontsize=8, ha='center')

plt.show()
No description has been provided for this image
In [12]:
bar_height=20
margin_height=40
chart_height=(bar_height*len(df_grouped['Estados'])) + margin_height

df_grouped_sorted = df_grouped.sort_values(by='Count')

fig = go.Figure(go.Bar(
    x= df_grouped_sorted['Count'],
    y=df_grouped_sorted['Estados'],
    orientation='h'
))

fig.update_layout(
    title='Distribuição de trabalhos por estado',
    xaxis_title='Estado',
    yaxis_title='Número de trabalhos',
    height= chart_height,
    margin=dict(t=50,b=50)
)

fig.show()

Distribuição por área de conhecimento¶

In [13]:
df_grouped_areas_con=df.groupby('NM_GRANDE_AREA_CONHECIMENTO').size()
In [14]:
df_grouped_areas_con.head()
Out[14]:
NM_GRANDE_AREA_CONHECIMENTO
CIÊNCIAS AGRÁRIAS              7139
CIÊNCIAS BIOLÓGICAS            4335
CIÊNCIAS DA SAÚDE             12563
CIÊNCIAS EXATAS E DA TERRA     6442
CIÊNCIAS HUMANAS              14011
dtype: int64
In [15]:
df_grouped_areas_con= df_grouped_areas_con.reset_index()
df_grouped_areas_con.columns= ['Area_conhecimento', 'Count']
In [16]:
df_grouped_areas_con.head()
Out[16]:
Area_conhecimento Count
0 CIÊNCIAS AGRÁRIAS 7139
1 CIÊNCIAS BIOLÓGICAS 4335
2 CIÊNCIAS DA SAÚDE 12563
3 CIÊNCIAS EXATAS E DA TERRA 6442
4 CIÊNCIAS HUMANAS 14011
In [ ]:
fig = px.bar(df_grouped_areas_con.sort_values(by='Count', ascending=False), x='Area_conhecimento', y='Count', title='Histograma Vertical')
fig.update_layout(yaxis_title='Count', xaxis_title='Area_conhecimento')
fig.show()
In [ ]:
 
In [18]:
colors = px.colors.qualitative.Pastel
fig2 = go.Figure(data=[go.Pie(labels=df_grouped_areas_con['Area_conhecimento'], values=df_grouped_areas_con['Count'], hole=0.4, marker=dict(colors=colors))])
fig2.show()
In [ ]:
 

Distribuição temporal¶

In [19]:
from datetime import datetime
In [20]:
df['DT_TITULACAO'] = df['DT_TITULACAO'].apply(lambda x: datetime.strptime(x,'%d%b%Y:%H:%M:%S'))
In [21]:
df['DT_TITULACAO'] = df['DT_TITULACAO'].apply(lambda x: datetime.strftime(x,'%d%m%Y'))
In [22]:
df.groupby('DT_TITULACAO')
Out[22]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7653c0250450>

Processamento de Linguagem Natural¶

In [23]:
import nltk
from nltk.tokenize import word_tokenize
from pathlib import Path

resumo_df = df[["DS_RESUMO","NM_GRANDE_AREA_CONHECIMENTO"]]
resumo_df.columns = ["resumo", "area"]
resumo_df['tokens'] = resumo_df["resumo"].apply(lambda x: word_tokenize(x))

resumo_df.head()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[23], line 1
----> 1 import nltk
      2 from nltk.tokenize import word_tokenize
      3 from pathlib import Path

ModuleNotFoundError: No module named 'nltk'
In [58]:
stopwords = {word.strip() for word in set(Path("stopwords.txt").read_text().split("\n"))}
stopwords
Out[58]:
{'a',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquilo',
 'as',
 'até',
 'com',
 'como',
 'da',
 'das',
 'de',
 'dela',
 'delas',
 'dele',
 'deles',
 'depois',
 'do',
 'dos',
 'e',
 'ela',
 'elas',
 'ele',
 'eles',
 'em',
 'entre',
 'era',
 'eram',
 'essa',
 'essas',
 'esse',
 'esses',
 'esta',
 'estamos',
 'estas',
 'estava',
 'estavam',
 'este',
 'esteja',
 'estejam',
 'estejamos',
 'estes',
 'esteve',
 'estive',
 'estivemos',
 'estiver',
 'estivera',
 'estiveram',
 'estiverem',
 'estivermos',
 'estivesse',
 'estivessem',
 'estivéramos',
 'estivéssemos',
 'estou',
 'está',
 'estávamos',
 'estão',
 'eu',
 'foi',
 'fomos',
 'for',
 'fora',
 'foram',
 'forem',
 'formos',
 'fosse',
 'fossem',
 'fui',
 'fôramos',
 'fôssemos',
 'haja',
 'hajam',
 'hajamos',
 'havemos',
 'havia',
 'hei',
 'houve',
 'houvemos',
 'houver',
 'houvera',
 'houveram',
 'houverei',
 'houverem',
 'houveremos',
 'houveria',
 'houveriam',
 'houvermos',
 'houverá',
 'houverão',
 'houveríamos',
 'houvesse',
 'houvessem',
 'houvéramos',
 'houvéssemos',
 'há',
 'hão',
 'isso',
 'isto',
 'já',
 'lhe',
 'lhes',
 'mais',
 'mas',
 'me',
 'mesmo',
 'meu',
 'meus',
 'minha',
 'minhas',
 'muito',
 'na',
 'nas',
 'nem',
 'no',
 'nos',
 'nossa',
 'nossas',
 'nosso',
 'nossos',
 'num',
 'numa',
 'não',
 'nós',
 'o',
 'os',
 'ou',
 'para',
 'pela',
 'pelas',
 'pelo',
 'pelos',
 'por',
 'qual',
 'quando',
 'que',
 'quem',
 'se',
 'seja',
 'sejam',
 'sejamos',
 'sem',
 'ser',
 'serei',
 'seremos',
 'seria',
 'seriam',
 'será',
 'serão',
 'seríamos',
 'seu',
 'seus',
 'somos',
 'sou',
 'sua',
 'suas',
 'são',
 'só',
 'também',
 'te',
 'tem',
 'temos',
 'tenha',
 'tenham',
 'tenhamos',
 'tenho',
 'ter',
 'terei',
 'teremos',
 'teria',
 'teriam',
 'terá',
 'terão',
 'teríamos',
 'teu',
 'teus',
 'teve',
 'tinha',
 'tinham',
 'tive',
 'tivemos',
 'tiver',
 'tivera',
 'tiveram',
 'tiverem',
 'tivermos',
 'tivesse',
 'tivessem',
 'tivéramos',
 'tivéssemos',
 'tu',
 'tua',
 'tuas',
 'tém',
 'têm',
 'tínhamos',
 'um',
 'uma',
 'você',
 'vocês',
 'vos',
 'à',
 'às',
 'é',
 'éramos'}
In [59]:
resumo_df["filtered"] = resumo_df["tokens"].apply(lambda x: " ".join([word for word in x if word.lower() not in stopwords]))
resumo_df.head()
Out[59]:
resumo area tokens filtered
0 O TERRIT�RIO AMAZ�NICO � RECONHECIDO PELA SUA ... MULTIDISCIPLINAR [O, TERRIT�RIO, AMAZ�NICO, �, RECONHECIDO, PEL... TERRIT�RIO AMAZ�NICO � RECONHECIDO GRANDE BIOD...
1 A RELA��O ENTRE O HOMEM E AS PLANTAS FOI ESTAB... MULTIDISCIPLINAR [A, RELA��O, ENTRE, O, HOMEM, E, AS, PLANTAS, ... RELA��O HOMEM PLANTAS ESTABELECIDA DESDE PRIM�...
2 A UTILIZA��O DE MICRO-ORGANISMOS ENDOF�TICOS C... MULTIDISCIPLINAR [A, UTILIZA��O, DE, MICRO-ORGANISMOS, ENDOF�TI... UTILIZA��O MICRO-ORGANISMOS ENDOF�TICOS FONTE ...
3 OS FUNGOS FILAMENTOSOS S�O CONSIDERADOS BOAS F... MULTIDISCIPLINAR [OS, FUNGOS, FILAMENTOSOS, S�O, CONSIDERADOS, ... FUNGOS FILAMENTOSOS S�O CONSIDERADOS BOAS FONT...
4 A MAL�RIA � UMA DAS DOEN�AS MAIS FATAIS QUE AF... MULTIDISCIPLINAR [A, MAL�RIA, �, UMA, DAS, DOEN�AS, MAIS, FATAI... MAL�RIA � DOEN�AS FATAIS AFETA HUMANIDADE . DU...
In [61]:
resumo_df['area'].unique()
Out[61]:
array(['MULTIDISCIPLINAR'], dtype=object)
In [60]:
import wordcloud

output = Path("results")
output.mkdir(parents=True, exist_ok=True)

grouped_area = resumo_df.groupby("area")["filtered"].apply(' '.join).reset_index()

for index, row in grouped_area.iterrows():

    filename = output / f"{row['area']}.png"

    word = wordcloud.WordCloud(width=800, height=400, background_color="white", stopwords=stopwords).generate(row['filtered'])

    word.to_file(filename)
In [ ]:
 

Usando berts para categorizar¶

In [101]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
from sklearn.model_selection import train_test_split
import shutil
import random
In [27]:
df.groupby(['NM_GRANDE_AREA_CONHECIMENTO']).size()
Out[27]:
NM_GRANDE_AREA_CONHECIMENTO
CIÊNCIAS AGRÁRIAS               7139
CIÊNCIAS BIOLÓGICAS             4335
CIÊNCIAS DA SAÚDE              12563
CIÊNCIAS EXATAS E DA TERRA      6442
CIÊNCIAS HUMANAS               14011
CIÊNCIAS SOCIAIS APLICADAS     12377
ENGENHARIAS                     7580
LINGÜÍSTICA, LETRAS E ARTES     5380
MULTIDISCIPLINAR               12411
dtype: int64
In [28]:
data = df[['DS_ABSTRACT','CD_GRANDE_AREA_CONHECIMENTO' ] ]
In [29]:
data = data.rename(columns={'DS_ABSTRACT': 'X', 'CD_GRANDE_AREA_CONHECIMENTO': 'Y'})
In [30]:
data['X'].head()
Out[30]:
0    THE AMAZON TERRITORY IS RECOGNIZED FOR ITS GRE...
1    THE RELATIONSHIP BETWEEN HUMANITY AND PLANTS H...
2    DUE TO THEIR VERSATILE METABOLITES, THE USE OF...
3    FILAMENTOUS FUNGI ARE CONSIDERED GOOD SOURCES ...
4    MALARIA IS ONE OF THE MOST FATAL DISEASES AFFE...
Name: X, dtype: object
In [74]:
data_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True).dropna()  # frac=1 means shuffle all data
In [75]:
train_size = int(0.8 * len(data_shuffled))
In [91]:
df_train_val = data_shuffled[:train_size]  # Training set (80%)
df_test = data_shuffled[train_size:] 
In [92]:
df_train= df_train_val[:train_size]
df_val = df_train_val[train_size:]
In [78]:
# X_train_val, X_test, y_train_val, y_test = train_test_split(df['X'], df['Y'], test_size=0.2, random_state=42)

# x_train, x_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)
In [79]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42
In [93]:
X = df_train['X'].str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)
Y = df_train['Y'].values
df_val = df_val['X'].str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)
df_test = df_test['X'].str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)
In [82]:
vectorizer = tf.keras.layers.TextVectorization(max_tokens=30000, output_sequence_length=100)
In [83]:
vectorizer.adapt(X)
In [84]:
train_ds = tf.data.Dataset.from_tensor_slices((X, Y))
In [86]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)  # Expand dims to add an extra dimension for the vectorizer
    return vectorizer(text), label
In [87]:
dataset = tf.data.Dataset.from_tensor_slices((X, Y))
In [88]:
vectorized_ds = dataset.map(vectorize_text)
In [89]:
vectorized_ds = vectorized_ds.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)
In [96]:
for text_batch, label_batch in vectorized_ds.take(1):
    for i in range(1):
        print(f"Abstract: {text_batch.numpy()[1]}")
        label=label_batch.numpy()[i]
        print(f"label: {label}")
Abstract: [[    2  4350  1385   515    16     2    94  3624  4920    14  1153   109
      2  2453     3  2253 15354  1487  6970  2964     8     2     1     1
      4  4912  3806    16     2  2376 11308  4920     2  4350  1385  3136
    110 11941  1249   555     4  1721     2   555 10566    45     7  2654
      2  4350  1385   515    16     2    94  3624  4920    14  1153   109
      2  2453     3  2253 15354  1487  6970  2964     8     2     1     1
      4  4912  3806    16     2  2376 11308  4920     2  4350  1385  3136
    110 11941  1249   555     4  1721     2   555 10566    45     7 12731
   8189     3  1603     4]]
label: 40000001
2024-10-21 21:17:00.966064: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
In [102]:
#@title ##Escolher um modelo BERT pré-treinado (TensorFlow Hub)

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'
#bert_model_name = 'small_bert/bert_en_uncased_L-2_H-128_A-2'

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'modelo BERT selecionado           : {tfhub_handle_encoder}')
print(f'Modelo de pré-processamento auto-selecionado: {tfhub_handle_preprocess}')
modelo BERT selecionado           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Modelo de pré-processamento auto-selecionado: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
In [105]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
In [ ]:
 
In [ ]: